package test;

import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler;

import java.util.List;

/**
 * WebMagic page processor that crawls OSChina blog posts.
 *
 * <p>For every downloaded page it queues all links that look like a blog post
 * URL, then tries to extract the post title and body from the current page.
 * Pages that yield both values are emitted with the fields {@code title} and
 * {@code content}; all other pages are marked as skipped.
 */
public class MyProcessor implements PageProcessor {

    /**
     * Regex for blog post links. The dots in the host name are escaped so they
     * only match a literal '.', not an arbitrary character.
     */
    private static final String BLOG_POST_URL_REGEX =
            "https://my\\.oschina\\.net/u/[0-9]+/blog/[0-9]+";

    /** XPath selecting the text of the post title. */
    private static final String TITLE_XPATH =
            "//*[@id=\"mainScreen\"]/div/div[1]/div/div[2]/div[1]/div[2]/h2/text()";

    /** XPath selecting the element that holds the post body. */
    private static final String CONTENT_XPATH = "//*[@id=\"articleContent\"]";

    /** Seed URL the crawl starts from. */
    private static final String SEED_URL = "https://www.oschina.net/blog";

    /** Output directory used when none is given on the command line. */
    private static final String DEFAULT_OUTPUT_DIR = "D:\\temp\\file1\\";

    /** Redis host used when none is given on the command line. */
    private static final String DEFAULT_REDIS_HOST = "192.168.207.111";

    /** Crawl settings, built once so every {@link #getSite()} call returns the same object. */
    private final Site site = Site.me()
            .setSleepTime(100)  // interval between requests
            .setTimeOut(3000)   // request timeout
            .setRetryTimes(3);  // number of retries

    /**
     * Parses a downloaded page: queues discovered blog post links and extracts
     * the title and content of the current page.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        // Queue every blog post link found on this page for later crawling.
        page.addTargetRequests(page.getHtml().regex(BLOG_POST_URL_REGEX).all());

        String title = page.getHtml().xpath(TITLE_XPATH).toString();
        String content = page.getHtml().xpath(CONTENT_XPATH).toString();

        if (StringUtils.isNoneBlank(title, content)) {
            page.putField("title", title);
            page.putField("content", content);
        } else {
            // Title or body is missing (e.g. a listing page): mark the page as skipped.
            page.setSkip(true);
        }
    }

    /**
     * Returns the crawl settings (sleep time, timeout, retry count).
     *
     * @return the site configuration used by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Entry point: starts a spider that writes results to files and uses a
     * Redis-backed scheduler.
     *
     * @param args optional overrides: {@code args[0]} is the output directory,
     *             {@code args[1]} is the Redis host; missing values fall back
     *             to the built-in defaults
     */
    public static void main(String[] args) {
        String outputDir = args.length > 0 ? args[0] : DEFAULT_OUTPUT_DIR;
        String redisHost = args.length > 1 ? args[1] : DEFAULT_REDIS_HOST;

        Spider.create(new MyProcessor())
                .addUrl(SEED_URL)
                .addPipeline(new FilePipeline(outputDir))
                .setScheduler(new RedisScheduler(redisHost))
                .start();
    }
}
